/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]	// save key value
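	// The masks above implement the standard Poly1305 clamp on r
	// (RFC 7539): the top four bits of key bytes 3,7,11,15 and the
	// low two bits of bytes 4,8,12 are cleared.  Viewed as two
	// little-endian 64-bit words that is (illustrative C sketch
	// only; le64 is a hypothetical little-endian load helper):
	//
	//	r0 = le64(key + 0) & 0x0ffffffc0fffffffULL;
	//	r1 = le64(key + 8) & 0x0ffffffc0ffffffcULL;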

	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif
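	// poly1305_init hands back, through the table passed in x2,
	// pointers to either the scalar or the NEON blocks/emit
	// routines, chosen by the ARMV7_NEON capability bit tested
	// above; under ILP32 the two pointers are stored as 32-bit
	// words.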

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]	// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr
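	// The "final reduction" above folds the bits at 2^130 and up
	// back into the low limbs using 2^130 == 5 (mod 2^130-5):
	//	x10 = (x14 & ~3) + (x14 >> 2) == 5*(x14 >> 2)
	// so h becomes (h mod 2^130) + 5*(h >> 130), i.e. only
	// partially reduced, which is sufficient between blocks.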

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]	// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq
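	// Constant-time reduction mod p = 2^130-5: if h+5 carries into
	// bit 130, then h >= p and h+5-2^130 == h-p, so the csel pair
	// keeps the incremented words; otherwise it keeps h.  Either
	// way only the low 128 bits survive into the tag, with no
	// branch on secret data.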

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
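// poly1305_mult computes h = (h*r) mod (2^130-5) with h held as two
// 64-bit limbs plus a small top limb h2.  A rough C-style sketch of
// the schoolbook product it implements (illustrative only; t0..t2 are
// 128-bit mul/umulh pairs, and the h2 terms use plain 64-bit mul
// because h2 is at most a few bits wide):
//
//	s1 = r1 + (r1 >> 2);			// = 5*(r1/4); r1 % 4 == 0
//	t0 = h0*r0 + h1*s1;			// h1*r1*2^128 folded via 2^130 = 5
//	t1 = h0*r1 + h1*r0 + h2*s1 + (t0 >> 64);
//	t2 = h2*r0 + (t1 >> 64);
//	h2 = t2 & 3;				// keep bits 128..129
//	h0 = (uint64_t)t0 + 5*(t2 >> 2);	// fold bits >= 2^130 once more
//	h1 = (uint64_t)t1 + carry;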

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]	// s1
	str	w14,[x0,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]	// s2
	str	w15,[x0,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]	// s3
	str	w16,[x0,#16*7]	// r4
	str	w15,[x0,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat
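// poly1305_splat splits a base 2^64 power of r into five 26-bit limbs
// and stores each limb (plus 5*r_i for the modular folds) at a
// 16-byte stride.  The callers step x0 back by 4 between calls, so
// once r^1..r^4 have been written every 128-bit vector holds one limb
// of {r^4,r^3,r^2,r^1} in its four 32-bit lanes (lane 0 = r^4), ready
// for the two-blocks-per-lane NEON multiplies below.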

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]	// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr
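	// The block above repacks five 26-bit limbs into base 2^64:
	//	x4  = h0 | h1<<26 | h2<<52		(bits   0..63)
	//	x5  = h2>>12 | h3<<14 | h4<<40		(bits  64..127)
	//	x14 = h4>>24 + carry			(bits 128+)
	// Because the NEON path keeps limbs only partially reduced,
	// x14 can exceed 3, hence the extra 2^130 = 5 fold before the
	// next block is accumulated.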

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]	// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]	// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2	// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]
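	// At this point the table at ctx+48 caches r^1..r^4 and the
	// matching 5*r_i values in base 2^26, interleaved so that one
	// ld1 of four 32-bit lanes fetches the same limb of all four
	// powers; x30 is reloaded because the bl calls above clobbered
	// it.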

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48
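	// x3 carries the padbit (1 for full blocks); shifted to bit 24
	// it is added after the "lsr #40" extractions below so that it
	// lands on bit 24 of the top 26-bit limb, i.e. the 2^128 term
	// of each 16-byte block.  x15 points at the r^n table at
	// ctx+48.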

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16	// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16	// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
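	//
	// Each step below carries bits >= 2^26 of one limb into the
	// next; the h4 -> h0 carry is multiplied by 5 (the add + shl #2
	// pair) because 2^130 = 5 (mod p).  Carries are left one step
	// behind on purpose: limbs stay well below 32 bits, so full
	// normalization per iteration is unnecessary.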

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s		// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s		// h0 -> h1
	add	v28.2s,v28.2s,v30.2s		// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]
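	// The hash goes back in base 2^26, still only partially
	// reduced; the final reduction mod 2^130-5 is deferred to
	// poly1305_emit(_neon).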

.Lno_data_neon:
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]	// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2